## Start from a clean global workspace: remove every (non-hidden) object that
## ls() reports. Attached packages and options() are not affected by this call.
rm(list = ls())

library(plyr)                           #the order for loading plyr, dplyr matters, version 1.8.4
library(dplyr)                          #0.7.4
library(readr)                          #1.1.1
library(tidytext)                       #0.1.7
library(Matrix)                         #1.2.6
library(stm)                            #version 1.1.3 -- this appears to affect stop word removal, if different version, keywords will only have slightly different order (and might have words like 'im')
library(lubridate)                      #1.6.0
library(stringr)                        #1.3.0
library(tm)                             #0.7.3

## ## to load parrot
## install.packages(c("stm","ggplot2","gridExtra","Matrix","reshape2","ForeCA","devtools","magrittr","RSpectra","irlba","gtable","fda"))
## install.packages(c("dplyr","readr","tidyr","CCA"))
## install.packages(c("knitr"))

## Fix the RNG state so that the random subsample drawn further down is
## reproducible from run to run.
set.seed(987654321)

## Prefix for the input files. It is pasted directly in front of the file
## names, so a non-empty value needs its own trailing path separator.
in_path <- ""

## Tab-separated, header-less counts table. Column names are attached after
## reading, in file order.
arabic_geo_tweets <- setNames(
    read.table(
        file = paste0(in_path, "arabic_usgeo_counts_20150901_to_20170213.txt.gz"),
        header = FALSE,
        sep = "\t",
        quote = "\"",
        comment.char = "",
        fill = TRUE
    ),
    c(
        "date", "userid", "user_lang", "lang", "usa", "state",
        "arabic_name", "nonarabic_name", "missing_coords", "tweets"
    )
)

## Tab-separated, header-less tweet text sample. Quote and comment handling are
## both switched off, so quote characters stay in the fields as read.
arabic_geo_text <- setNames(
    read_delim(
        file = paste0(in_path, "arabic_sample_text_20150901_to_20170213.txt.gz"),
        delim = "\t",
        quote = "",
        comment = "",
        col_names = FALSE
    ),
    c(
        "userid", "date", "tweet_id", "usa", "place_state_abb",
        "lang", "missing_coords", "text"
    )
)

## Build the tweet sample that is scaled below. A tweet is kept when
##   * its date is on/after 2016-11-07 and strictly before 2017-01-27
##     (the original note read "election to feb 13 is all the data processed
##     for this purpose"; the cutoff actually applied is 2017-01-27),
##   * missing_coords == "False" (the "precise" run; the original note says to
##     rerun from the beginning for precise vs. coarse, and that coarse is much
##     more data and takes longer),
##   * its userid, with double quotes stripped, belongs to a user that has a
##     row in arabic_geo_tweets with usa == "True", a non-empty state and
##     arabic_name == "True",
##   * its text does not contain the substring "RT".
##     NOTE(review): this is a plain substring match, so any text containing
##     the letters "RT" is dropped, not only retweet markers -- confirm intent.
## Each remaining row is then weighted by 1 / (number of remaining tweets by
## the same user) and one fifth of the rows is drawn without replacement.
input_data_prep <- local({
    eligible_users <- subset(
        arabic_geo_tweets,
        usa == "True" & state != "" & arabic_name == "True"
    )$userid
    tweet_day <- as.Date(arabic_geo_text$date, "%Y_%m_%d")

    subset(
        arabic_geo_text,
        tweet_day >= "2016-11-07" & tweet_day < "2017-01-27"
        & missing_coords == "False"
        & gsub("\"", "", userid) %in% eligible_users
        & !grepl("RT", text)
    ) %>%
        group_by(userid) %>%
        mutate(n_tweets = n()) %>%
        ungroup() %>%
        ## /5 for precise, /100 for coarse (per the original author's note)
        sample_n(size = nrow(.) / 5, replace = FALSE, weight = 1 / n_tweets)
})


#### SETTINGS
## Placeholder: assign the project directory here before running. The value is
## left empty in this file and is passed to setwd() as-is.
working_directory <- ""
setwd(working_directory)

## see github.com/wilryh/parrot for latest version, contact if need exact version for this article
## parrot is loaded from source through devtools rather than with library().
## Placeholder: the empty string given to as.package() must be replaced with
## the path to the parrot package sources.
library(devtools)
parrot <- as.package("")
load_all(parrot)


## Pre-process the sampled tweet text with stm: lower-case, remove stop words,
## no stemming, keep tokens of length >= 2. The full sample is passed along as
## metadata so it stays attached to the documents.
processed <- textProcessor(
    documents = input_data_prep$text,
    metadata = data.frame(input_data_prep),
    lowercase = TRUE,
    removestopwords = TRUE,
    stem = FALSE,
    wordLengths = c(2, Inf)
)

## Let stm build the final documents / vocab / meta triple with its default
## thresholds.
out <- prepDocuments(
    documents = processed$documents,
    vocab = processed$vocab,
    meta = processed$meta
)

## Binary document/term matrix built by parrot's doc_to_tdm_fast(). The code
## below indexes its rows alongside out$meta and its columns alongside
## out$vocab.
tdm <- doc_to_tdm_fast(out$documents, binary=TRUE)
## NOTE(review): all-zero rows are removed from tdm only; out$meta is not
## subset here. If any row is actually dropped the two no longer line up --
## confirm whether that can happen after prepDocuments().
tdm <- tdm[Matrix::rowSums(tdm) > 0,]


## Optional extra filter (off by default): drop documents that contain the
## terms "rt"/"http", or whose raw text contains "/" or "@", then drop terms
## that no longer occur in any remaining document.
filter_rt_urls <- FALSE
if (filter_rt_urls) {

    ## The row mask below is applied to both objects, so they must be aligned.
    stopifnot(nrow(out$meta) == nrow(tdm))

    ## The raw tweet text lives in the `text` metadata column. The previous
    ## `full_text` lookup returned NULL, which made both grepl() results (and
    ## therefore the whole row mask) zero-length, so every document was dropped.
    has_url <- grepl("/", out$meta$text)
    has_at <- grepl("@", out$meta$text)

    ## Compute the document mask once, before either object is modified.
    ## drop=FALSE keeps a matrix even when only one of the two terms is in the
    ## vocabulary, so rowSums() always receives a two-dimensional object.
    keep_docs <- Matrix::rowSums(
        tdm[, out$vocab %in% c("rt","http"), drop=FALSE]
        )==0 & !has_url & !has_at

    out$meta <- out$meta[keep_docs,]
    tdm <- tdm[keep_docs,]

    ## Remove terms that vanished together with the filtered documents.
    keep_terms <- Matrix::colSums(tdm)>0
    thevocab <- out$vocab[keep_terms]
    tdm <- tdm[,keep_terms]

} else {
    thevocab <- out$vocab
}

## Cap the vocabulary: when there are more than 10000 terms, keep only those
## whose document count is at least that of the 10000th most frequent term
## (ties at the cutoff are all kept, so slightly more than 10000 can remain).
word_counts <- Matrix::colSums(tdm)
if (length(word_counts) > 10000) {
    count_cutoff <- sort(word_counts, decreasing = TRUE)[10000]
    keep_term <- word_counts >= count_cutoff
    tdm <- tdm[, keep_term]
    thevocab <- thevocab[keep_term]
}

## Number of dimensions handed to the scaling step: e^(log(#terms)/2 + 1),
## rounded -- i.e. about e * sqrt(number of retained terms).
n_dimensions <- round(exp(1)^(log(ncol(tdm)) / 2 + 1))

## Run parrot's scale_text() on the prepared matrix. The same n_dimensions
## value is used for the compression step and for both pivot steps.
cat("\nScaling..\n")
scores <- scale_text(
    tdm = tdm,
    vocab = thevocab,
    meta = out$meta,
    pivot1 = 3,
    pivot2 = 3,
    n_dimension_pivot1 = n_dimensions,
    n_dimension_pivot2 = n_dimensions,
    n_dimension_compression = n_dimensions,
    compress_fast = TRUE                   #TRUE for small data sets
)

## Print, for each of the first `n_dimensions` dimensions, a pandoc table with
## the top keywords at the negative end (`<-`) and the positive end (`->`).
##
## scores        list-like object providing `vocab`, `pivot_scores` (indexed
##               by column, one column per dimension) and `word_counts`.
## n_dimensions  number of leading dimensions to print; 0 prints nothing.
## n_words       number of keywords listed per end (default 15).
##
## Called for its printed output; returns NULL invisibly.
get_keywords <- function(scores, n_dimensions, n_words=15) {
    ## note: at the time of this writing, parrot keywords were chosen with this function
    ## Words are ranked by pivot score times the square root of their count.
    ## (A disabled variant ranked by scores$word_scores * word_counts^(1/4).)
    count_weight <- scores$word_counts^(1/2)

    ## seq_len() rather than 1:n so that n_dimensions = 0 yields no iterations
    ## instead of visiting the indices 1 and 0.
    for (i in seq_len(n_dimensions)) {
        weighted <- scores$pivot_scores[, i] * count_weight
        print(knitr::kable(
            dplyr::data_frame(
                `<-`=head(
                    scores$vocab[order(-weighted, decreasing=TRUE)],
                    n=n_words
                    ),
                `->`=head(
                    scores$vocab[order(weighted, decreasing=TRUE)],
                    n=n_words
                    )
                ), align="c",format="pandoc",
            caption=paste("Dimension", i, "keywords")
            ))
        cat("\n")
    }

    invisible(NULL)
}

## Print keyword tables for the first five dimensions. The literal 5 passed
## here is separate from the global n_dimensions used for the scaling step.
get_keywords(scores, n_dimensions=5)
